Choosing the right complexity for a model


In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
try:
    # `train_test_split` lives in `model_selection` since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old installations; this module was removed in 0.20.
    from sklearn.cross_validation import train_test_split


iris = load_iris()
X = iris.data
y = iris.target


# dataset for decision function visualization:
# keep only the first two features (so the problem can be drawn in a plane)
# and drop every sample of class 0, then shift the remaining labels down by
# one so that they start at 0.
keep = y > 0
X_2d = X[keep, :2]
y_2d = y[keep] - 1

# A fixed seed makes the split -- and therefore every score and figure
# below -- reproducible across "Restart Kernel -> Run All".
X_train, X_test, y_train, y_test = train_test_split(X_2d, y_2d, random_state=0)

In [2]:
%matplotlib inline
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100)


Out[2]:
<matplotlib.collections.PathCollection at 0x7f0bdc159610>

In [3]:
def show_decision_function(clf, ax, X=None, y=None):
    """Draw a fitted binary classifier's score surface with samples on top.

    The classifier is evaluated on a fixed 200 x 200 grid spanning
    x in [4.5, 8] and y in [1.5, 4.0]; the scores are rendered with
    ``ax.pcolormesh`` and the sample points are scattered over them.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``decision_function`` or, failing that, ``predict_proba``.
        It is called on an array with two columns (grid x, grid y).
    ax : matplotlib Axes
        Axes to draw into. Limits are set to the grid extent and ticks are
        removed.
    X, y : array-like, optional
        Points (two columns) and their labels to overlay. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used, which is what the
        original two-argument call did.

    Returns
    -------
    None
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    xx, yy = np.meshgrid(np.linspace(4.5, 8, 200), np.linspace(1.5, 4.0, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]

    score = getattr(clf, "decision_function", None)
    if score is not None:
        Z = score(grid)
    else:
        # Use the probability of the *second* class so that high values mean
        # the same thing as a positive decision_function value; column 0
        # would flip the colour scale relative to the branch above.
        Z = clf.predict_proba(grid)[:, 1]

    Z = Z.reshape(xx.shape)
    # "jet" is the registered name of the colormap previously passed as
    # plt.cm.jet.
    ax.pcolormesh(xx, yy, Z, cmap="jet")
    ax.set_xlim(4.5, 8)
    ax.set_ylim(1.5, 4.0)
    ax.set_xticks(())
    ax.set_yticks(())
    ax.scatter(X[:, 0], X[:, 1], c=y, s=100)

In [4]:
from sklearn.svm import SVC

# Values of the SVC `C` parameter to compare; each one gets its own panel
# in a 2 x 3 grid of subplots.
Cs = [0.01, 0.1, 1, 10, 100, 1000]
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
training_scores, test_scores = [], []

for ax, C in zip(axes.flat, Cs):
    # Fit one classifier per C, record its score on both splits and draw
    # its decision surface into the matching panel.
    clf = SVC(C=C, gamma=10).fit(X_train, y_train)
    training_scores.append(clf.score(X_train, y_train))
    test_scores.append(clf.score(X_test, y_test))
    show_decision_function(clf, ax)

plt.savefig("iris_overfitting_decision.png", bbox_inches="tight")



In [5]:
# Training vs. test score for each value of C tried above. The curves are
# plotted against the position in `Cs` (evenly spaced) and the ticks are
# relabelled with the actual C values.
plt.figure(figsize=(20, 10))
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
# Derive the tick positions from `Cs` rather than hard-coding its length, so
# the labels stay aligned if the list of C values is edited.
plt.xticks(range(len(Cs)), Cs)
plt.xlabel("C")
plt.ylabel("accuracy")
plt.legend(loc="best")
plt.savefig("iris_overfitting_curve.png", bbox_inches="tight")